{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# `vaex` @ EuroSciPy Bilbao 2019\n",
    "\n",
    "## New York Taxi Dataset (2009-2015): Exploratory Data Analysis and Machine Learning example\n",
    "\n",
    "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:01:03.156091Z",
     "start_time": "2019-09-03T21:01:01.756744Z"
    }
   },
   "outputs": [],
   "source": [
    "import vaex\n",
    "from vaex.ui.colormaps import cm_plusmin\n",
    "import warnings; warnings.simplefilter('ignore')\n",
    "\n",
    "\n",
    "import pylab as plt\n",
    "import numpy as np\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:01:05.140682Z",
     "start_time": "2019-09-03T21:01:05.015565Z"
    }
   },
   "outputs": [],
   "source": [
    "!ls -lh /data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:01:36.458604Z",
     "start_time": "2019-09-03T21:01:36.412355Z"
    }
   },
   "outputs": [],
   "source": [
    "df_gaia = vaex.open('/data/gaia-dr2-sort-by-source_id.hdf5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:01:41.196937Z",
     "start_time": "2019-09-03T21:01:41.088977Z"
    }
   },
   "outputs": [],
   "source": [
    "df_gaia"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Open the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:02:12.867478Z",
     "start_time": "2019-09-03T21:02:12.787256Z"
    }
   },
   "outputs": [],
   "source": [
    "# Opens the data\n",
    "# df = vaex.open('/data/yellow_taxi_2009_2015_f32.hdf5')\n",
    "df = vaex.open('s3://vaex/taxi/yellow_taxi_2009_2015_f32.hdf5')\n",
    "\n",
    "# Check length\n",
    "print(f'Number of samples in the data: {len(df):,}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:02:21.706319Z",
     "start_time": "2019-09-03T21:02:21.676802Z"
    }
   },
   "outputs": [],
   "source": [
    "# View a portion of the dataset\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split the data into train & test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:02:32.295442Z",
     "start_time": "2019-09-03T21:02:32.289299Z"
    }
   },
   "outputs": [],
   "source": [
    "# Train / test split (by date): rows before the split row go to training,\n",
    "# the remaining rows go to the test set\n",
    "magic_row_nr = 1_026_944_937\n",
    "df_train, df_test = df[:magic_row_nr], df[magic_row_nr:]\n",
    "\n",
    "print(f'Number of samples in the training set: {len(df_train):,}')\n",
    "print(f'Number of samples in the test set:       {len(df_test):,}')\n",
    "\n",
    "# Sanity check: the two parts together must account for every row\n",
    "assert len(df) == len(df_test) + len(df_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:02:40.145267Z",
     "start_time": "2019-09-03T21:02:40.124484Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A first look at the contents of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:04:15.273193Z",
     "start_time": "2019-09-03T21:02:48.864557Z"
    }
   },
   "outputs": [],
   "source": [
    "# Basic description about the training dataset\n",
    "df_train.describe()\n",
    "# 9 min"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:06:27.832089Z",
     "start_time": "2019-09-03T21:06:27.511702Z"
    }
   },
   "outputs": [],
   "source": [
    "# Drop NANs\n",
    "df_train = df_train.dropna(column_names=['dropoff_latitude', 'dropoff_longitude', 'pickup_latitude'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:06:36.968023Z",
     "start_time": "2019-09-03T21:06:32.910903Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Abnormal number of passengers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:07:19.100315Z",
     "start_time": "2019-09-03T21:07:17.905106Z"
    }
   },
   "outputs": [],
   "source": [
    "# Number of passengers\n",
    "df_train.passenger_count.value_counts(progress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:07:50.986547Z",
     "start_time": "2019-09-03T21:07:50.653059Z"
    }
   },
   "outputs": [],
   "source": [
    "# Filter abnormal number of passengers\n",
    "df_train = df_train[(df_train.passenger_count>0) & (df_train.passenger_count<7)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean up distance values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:08:07.349170Z",
     "start_time": "2019-09-03T21:08:02.170956Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(8,4))\n",
    "df_train.plot1d('trip_distance', limits='minmax', f='log1p')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:08:22.463419Z",
     "start_time": "2019-09-03T21:08:21.402879Z"
    }
   },
   "outputs": [],
   "source": [
    "# How many trips have 0.0 distance?\n",
    "(df_train.trip_distance==0).astype('int').sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:08:31.894574Z",
     "start_time": "2019-09-03T21:08:31.133419Z"
    }
   },
   "outputs": [],
   "source": [
    "# What is the largest distance?\n",
    "# (Use a real name: assigning to `_` would shadow IPython's last-output variable.)\n",
    "max_trip_distance = df_train.trip_distance.max()\n",
    "\n",
    "# Reference distances in miles used for the comparisons printed below\n",
    "EARTH_MOON_MILES = 238_900\n",
    "EARTH_MARS_MILES = 33_900_000\n",
    "\n",
    "print(max_trip_distance, 'miles.')\n",
    "\n",
    "print('This is %3.1f times larger than the distance between the Earth and the Moon!' % (max_trip_distance / EARTH_MOON_MILES))\n",
    "print('or')\n",
    "print('This is %1.1f times the distance to Mars!' % (max_trip_distance / EARTH_MARS_MILES))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:08:46.972627Z",
     "start_time": "2019-09-03T21:08:46.078368Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(8,4))\n",
    "df_train.plot1d('trip_distance', limits=[0, 20], f=None)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:08:53.739157Z",
     "start_time": "2019-09-03T21:08:53.196681Z"
    }
   },
   "outputs": [],
   "source": [
    "# Keep only trips with a strictly positive distance below 10\n",
    "# (this drops zero-length, negative and very long trips)\n",
    "distance_ok = (df_train.trip_distance > 0) & (df_train.trip_distance < 10)\n",
    "df_train = df_train[distance_ok]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What _is_ New York City really?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:09:03.670191Z",
     "start_time": "2019-09-03T21:08:59.301716Z"
    }
   },
   "outputs": [],
   "source": [
    "# Interactively plot the pickup locations\n",
    "df_train.plot_widget(df_train.pickup_longitude, \n",
    "                     df_train.pickup_latitude, \n",
    "                     shape=512, \n",
    "                     f='log1p', \n",
    "                     colormap='plasma', \n",
    "                     limits='minmax')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:09:46.047518Z",
     "start_time": "2019-09-03T21:09:46.045366Z"
    }
   },
   "outputs": [],
   "source": [
    "# Define the NYC boundaries\n",
    "long_min = -74.05\n",
    "long_max = -73.75\n",
    "lat_min = 40.58\n",
    "lat_max = 40.90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:09:48.456660Z",
     "start_time": "2019-09-03T21:09:48.033007Z"
    }
   },
   "outputs": [],
   "source": [
    "# Keep only trips whose pickup AND dropoff both lie strictly inside the boundaries\n",
    "pickup_in_box = (\n",
    "    (df_train.pickup_longitude > long_min) & (df_train.pickup_longitude < long_max)\n",
    "    & (df_train.pickup_latitude > lat_min) & (df_train.pickup_latitude < lat_max)\n",
    ")\n",
    "dropoff_in_box = (\n",
    "    (df_train.dropoff_longitude > long_min) & (df_train.dropoff_longitude < long_max)\n",
    "    & (df_train.dropoff_latitude > lat_min) & (df_train.dropoff_latitude < lat_max)\n",
    ")\n",
    "df_train = df_train[pickup_in_box & dropoff_in_box]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create some features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:10:03.112886Z",
     "start_time": "2019-09-03T21:10:02.766247Z"
    }
   },
   "outputs": [],
   "source": [
    "# Time elapsed between pickup and dropoff, shared by the speed and duration features\n",
    "trip_duration = df_train.dropoff_datetime - df_train.pickup_datetime\n",
    "\n",
    "# Dividing by a one-hour / one-minute timedelta converts the duration to hours / minutes\n",
    "df_train['trip_speed_mph'] = df_train.trip_distance / (trip_duration / np.timedelta64(1, 'h'))\n",
    "df_train['trip_duration_min'] = trip_duration / np.timedelta64(1, 'm')\n",
    "df_train['fare_by_distance'] = df_train.fare_amount / df_train.trip_distance\n",
    "\n",
    "# Keep trips with a positive duration shorter than 2 hours (120 minutes)\n",
    "duration_ok = (df_train.trip_duration_min > 0) & (df_train.trip_duration_min < 120)\n",
    "df_train = df_train[duration_ok]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create some date/time features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:10:26.322929Z",
     "start_time": "2019-09-03T21:10:26.279433Z"
    }
   },
   "outputs": [],
   "source": [
    "# Date/time components of the pickup moment\n",
    "pickup_dt = df_train.pickup_datetime.dt\n",
    "df_train['pickup_hour'] = pickup_dt.hour\n",
    "df_train['pickup_day_of_week'] = pickup_dt.dayofweek\n",
    "# Shift the month by one so it can index month_names_list directly\n",
    "df_train['pickup_month'] = pickup_dt.month - 1\n",
    "# Day-of-week values 5 and 6 are the 'Sat' and 'Sun' entries of weekday_names_list\n",
    "df_train['pickup_is_weekend'] = (df_train.pickup_day_of_week >= 5).astype('int')\n",
    "\n",
    "# Label lists used for plotting and categorizing\n",
    "weekday_names_list = 'Mon Tue Wed Thu Fri Sat Sun'.split()\n",
    "month_names_list = 'Jan Feb Mar Apr May Jun July Aug Sep Oct Nov Dec'.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:10:38.505505Z",
     "start_time": "2019-09-03T21:10:38.501651Z"
    }
   },
   "outputs": [],
   "source": [
    "# Mark these integer columns as label/ordinal encoded categories\n",
    "hour_labels = list(map(str, range(24)))\n",
    "df_train.categorize(column='pickup_hour', labels=hour_labels, check=False)\n",
    "df_train.categorize(column='pickup_day_of_week', labels=weekday_names_list, check=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:11:03.445221Z",
     "start_time": "2019-09-03T21:10:45.945937Z"
    }
   },
   "outputs": [],
   "source": [
    "# Number of pick-ups per hour for a given day of the week\n",
    "df_train.plot('pickup_hour', 'pickup_day_of_week', colorbar=True, colormap=cm_plusmin, figsize=(15, 5))\n",
    "plt.xticks(np.arange(24), np.arange(24))\n",
    "plt.yticks(np.arange(7), weekday_names_list)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Groupby examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:11:59.595219Z",
     "start_time": "2019-09-03T21:11:40.683754Z"
    }
   },
   "outputs": [],
   "source": [
    "# Mean tip and mean speed for every pickup hour\n",
    "hourly_aggregations = {column: 'mean' for column in ('tip_amount', 'trip_speed_mph')}\n",
    "df_per_hour = df_train.groupby(by=df_train.pickup_hour).agg(hourly_aggregations)\n",
    "\n",
    "# Show the grouped DataFrame\n",
    "df_per_hour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:12:00.104206Z",
     "start_time": "2019-09-03T21:11:59.596442Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(14, 5))\n",
    "\n",
    "# One bar chart per aggregated quantity: (subplot position, bar heights, title, y-label)\n",
    "hours = df_per_hour.pickup_hour.values\n",
    "panels = [\n",
    "    (121, df_per_hour.tip_amount.values, 'Mean tip amount', 'mean tip amount'),\n",
    "    (122, df_per_hour.trip_speed_mph.values, 'Mean trip speed', 'mean trip speed [miles per hour]'),\n",
    "]\n",
    "for position, heights, title, ylabel in panels:\n",
    "    plt.subplot(position)\n",
    "    sns.barplot(x=hours, y=heights)\n",
    "    plt.title(title)\n",
    "    plt.xlabel('hour of day')\n",
    "    plt.ylabel(ylabel)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:13:00.267613Z",
     "start_time": "2019-09-03T21:12:48.674008Z"
    }
   },
   "outputs": [],
   "source": [
    "df_joined = df_train.join(df_per_hour, on='pickup_hour', rprefix=\"right_\")\n",
    "df_joined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T19:54:17.241250Z",
     "start_time": "2019-09-03T19:54:15.651479Z"
    }
   },
   "outputs": [],
   "source": [
    "# All the filtering, in case something went wrong, which NEVER happens\n",
    "# This cell rebuilds df_train from scratch, so it must produce the same column names\n",
    "# (pickup_hour, pickup_day_of_week, ...) and apply the same filters as the\n",
    "# step-by-step cells above; the modelling cells below rely on them.\n",
    "import vaex\n",
    "import numpy as np\n",
    "import pylab as plt\n",
    "# df = vaex.open('/data/yellow_taxi_2009_2015_f32.hdf5')\n",
    "df = vaex.open('s3://vaex/taxi/yellow_taxi_2009_2015_f32.hdf5')\n",
    "\n",
    "magic_row_nr = 1_026_944_937\n",
    "df_train, df_test = df[:magic_row_nr], df[magic_row_nr:]\n",
    "df_train = df_train.dropna(column_names=['dropoff_latitude', 'dropoff_longitude', 'pickup_latitude'])\n",
    "df_train = df_train[(df_train.passenger_count>0) & (df_train.passenger_count<7)]\n",
    "df_train = df_train[(df_train.trip_distance>0) & (df_train.trip_distance<10)]\n",
    "# Define the NYC boundaries\n",
    "long_min = -74.05\n",
    "long_max = -73.75\n",
    "lat_min = 40.58\n",
    "lat_max = 40.90\n",
    "# Make a selection based on the boundaries\n",
    "df_train = df_train[(df_train.pickup_longitude > long_min)  & (df_train.pickup_longitude < long_max) & \\\n",
    "        (df_train.pickup_latitude > lat_min)    & (df_train.pickup_latitude < lat_max) & \\\n",
    "        (df_train.dropoff_longitude > long_min) & (df_train.dropoff_longitude < long_max) & \\\n",
    "        (df_train.dropoff_latitude > lat_min)   & (df_train.dropoff_latitude < lat_max)]\n",
    "# Date/time features: use the pickup_* names that features_lgbm expects\n",
    "df_train['pickup_hour'] = df_train.pickup_datetime.dt.hour\n",
    "df_train['pickup_day_of_week'] = df_train.pickup_datetime.dt.dayofweek\n",
    "df_train['pickup_month'] = df_train.pickup_datetime.dt.month - 1\n",
    "df_train['pickup_is_weekend'] = (df_train.pickup_day_of_week>=5).astype('int')\n",
    "df_train['trip_speed_mph'] = df_train.trip_distance / ((df_train.dropoff_datetime - df_train.pickup_datetime) / np.timedelta64(1, 'h'))\n",
    "df_train['trip_duration_min'] = (df_train.dropoff_datetime - df_train.pickup_datetime) / np.timedelta64(1, 'm')\n",
    "df_train['fare_by_distance'] = (df_train.fare_amount / df_train.trip_distance)\n",
    "# Filter, keep durations that are within 2 hours (same filter as the step-by-step cells)\n",
    "df_train = df_train[(df_train.trip_duration_min>0) & (df_train.trip_duration_min<120)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predictive modelling example: predict the likely duration of a taxi trip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T20:46:58.541850Z",
     "start_time": "2019-09-03T20:46:56.854321Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:13:44.462572Z",
     "start_time": "2019-09-03T21:13:44.414272Z"
    }
   },
   "outputs": [],
   "source": [
    "import vaex.ml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:14:00.594898Z",
     "start_time": "2019-09-03T21:14:00.585670Z"
    }
   },
   "outputs": [],
   "source": [
    "# arc-distance in miles\n",
    "def arc_distance(theta_1, phi_1, theta_2, phi_2):\n",
    "    temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2\n",
    "           + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)\n",
    "    distance = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))\n",
    "    return distance * 3958.8\n",
    "\n",
    "# distance London - Utrecht [miles]\n",
    "arc_distance(51.5069797, -0.1295992, 52.0842715, 5.0124523)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:14:10.363166Z",
     "start_time": "2019-09-03T21:14:10.357768Z"
    }
   },
   "outputs": [],
   "source": [
    "# Add the arc-distance in miles as a virtual column\n",
    "# arc_distance takes (latitude, longitude) pairs: its theta arguments are the ones fed to\n",
    "# cos() in the haversine formula, so the latitudes must come first (as in the\n",
    "# London - Utrecht example). Passing longitude first gives wrong distances.\n",
    "df_train['arc_distance_miles_numpy'] = arc_distance(df_train.pickup_latitude, df_train.pickup_longitude, \n",
    "                                              df_train.dropoff_latitude, df_train.dropoff_longitude)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:14:39.290767Z",
     "start_time": "2019-09-03T21:14:16.846199Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train['arc_distance_miles_numpy'].sum(progress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:14:40.208769Z",
     "start_time": "2019-09-03T21:14:39.640298Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train['arc_distance_miles_cuda'] = df_train['arc_distance_miles_numpy'].jit_cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:14:51.645866Z",
     "start_time": "2019-09-03T21:14:44.971110Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train['arc_distance_miles_cuda'].sum(progress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:15:01.520135Z",
     "start_time": "2019-09-03T21:15:01.518085Z"
    }
   },
   "outputs": [],
   "source": [
    "# choose GPU or CPU\n",
    "df_train['arc_distance_miles'] = df_train['arc_distance_miles_cuda']\n",
    "# df_train['arc_distance_miles'] = df_train['arc_distance_miles_numpy']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:15:08.168746Z",
     "start_time": "2019-09-03T21:15:07.866469Z"
    }
   },
   "outputs": [],
   "source": [
    "def direction_angle(theta_1, phi_1, theta_2, phi_2):\n",
    "    \"\"\"Direction of travel in degrees: arctan2(theta_2 - theta_1, phi_2 - phi_1).\n",
    "\n",
    "    The result lies in [-180, 180]; 0 means a displacement purely along +phi.\n",
    "    \"\"\"\n",
    "    return np.rad2deg(np.arctan2(theta_2 - theta_1, phi_2 - phi_1))\n",
    "\n",
    "# The direction of travel (theta = longitude, phi = latitude, so 0 degrees is\n",
    "# increasing latitude), compiled with numba\n",
    "df_train['direction_angle'] = direction_angle(\n",
    "    theta_1=df_train.pickup_longitude, phi_1=df_train.pickup_latitude,\n",
    "    theta_2=df_train.dropoff_longitude, phi_2=df_train.dropoff_latitude,\n",
    ").jit_numba()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:15:11.705590Z",
     "start_time": "2019-09-03T21:15:11.353999Z"
    }
   },
   "outputs": [],
   "source": [
    "# Examine the train DataFrame at this point \n",
    "df_train.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encoding and transforming of features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:15:57.508330Z",
     "start_time": "2019-09-03T21:15:36.848072Z"
    }
   },
   "outputs": [],
   "source": [
    "# PCA of the pickup and dropoff locations - helps to \"straighten out\" the coordinates\n",
    "# NOTE(review): the later cells use PCA_0/PCA_1 for the pickup and PCA_2/PCA_3 as extra\n",
    "# features, so the second transform presumably continues the column numbering - confirm.\n",
    "pickup_coords = ['pickup_longitude', 'pickup_latitude']\n",
    "dropoff_coords = ['dropoff_longitude', 'dropoff_latitude']\n",
    "\n",
    "# pickup transformations\n",
    "pca_pu = vaex.ml.PCA(n_components=2, features=pickup_coords)\n",
    "df_train = pca_pu.fit_transform(df_train)\n",
    "\n",
    "# dropoff transformations\n",
    "pca_do = vaex.ml.PCA(n_components=2, features=dropoff_coords)\n",
    "df_train = pca_do.fit_transform(df_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:16:28.906297Z",
     "start_time": "2019-09-03T21:16:01.176402Z"
    }
   },
   "outputs": [],
   "source": [
    "# Side-by-side density plots: original pickup coordinates vs. their PCA projection\n",
    "plot_options = dict(colormap='plasma', f='log1p', shape=256, colorbar=False)\n",
    "\n",
    "plt.figure(figsize=(14, 5))\n",
    "\n",
    "plt.subplot(121)\n",
    "plt.title('pickup - original')\n",
    "df_train.plot(df_train.pickup_longitude, df_train.pickup_latitude, **plot_options)\n",
    "\n",
    "plt.subplot(122)\n",
    "plt.title('pickup - PCA transformed')\n",
    "df_train.plot(df_train.PCA_0, df_train.PCA_1, **plot_options)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:16:36.312247Z",
     "start_time": "2019-09-03T21:16:35.998600Z"
    }
   },
   "outputs": [],
   "source": [
    "# inspect the DataFrame\n",
    "df_train.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting up the predictor - `LightGBM`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:16:52.126301Z",
     "start_time": "2019-09-03T21:16:52.123838Z"
    }
   },
   "outputs": [],
   "source": [
    "features_lgbm = ['passenger_count', 'trip_distance', 'rate_code', 'pickup_hour', 'pickup_day_of_week', 'pickup_is_weekend', \n",
    "                 'arc_distance_miles', 'direction_angle', 'PCA_0', 'PCA_1', 'PCA_2', 'PCA_3']\n",
    "\n",
    "\n",
    "# the target\n",
    "target = 'trip_duration_min'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:16:56.095322Z",
     "start_time": "2019-09-03T21:16:55.802847Z"
    }
   },
   "outputs": [],
   "source": [
    "# Import the modeling library\n",
    "import vaex.ml.lightgbm # vaex.ml also supports XGBoost, CatBoost, scikit-learn, annoy, more to come)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:17:05.149982Z",
     "start_time": "2019-09-03T21:17:02.314324Z"
    }
   },
   "outputs": [],
   "source": [
    "# parameters - standard lightgbm options\n",
    "params = dict(\n",
    "    learning_rate=0.1,\n",
    "    max_depth=5,\n",
    "    colsample_bytree=0.8,\n",
    "    subsample=0.8,\n",
    "    reg_lambda=1,\n",
    "    reg_alpha=0,\n",
    "    min_child_weight=1,\n",
    "    objective='regression',\n",
    "    random_state=42,\n",
    "    n_jobs=-1,\n",
    ")\n",
    "\n",
    "# Take a small part of the training set so that the training runs fast, in real time\n",
    "df_train_mini = df_train[:1_000_000]\n",
    "\n",
    "# Instantiate the model object and fit it on the small sample\n",
    "booster = vaex.ml.lightgbm.LightGBMModel(features=features_lgbm, params=params, num_boost_round=100)\n",
    "booster.fit(df_train_mini, target=target)\n",
    "\n",
    "print('Training completed!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:17:12.824300Z",
     "start_time": "2019-09-03T21:17:11.470059Z"
    }
   },
   "outputs": [],
   "source": [
    "# Check performance on the training set - in reality one needs to do proper (x)-validation\n",
    "\n",
    "# Classical predict - get an in-memory array of the predictions\n",
    "pred = booster.predict(df_train_mini)\n",
    "\n",
    "# view the predictions\n",
    "display(pred)\n",
    "\n",
    "# Create a virtual column housing the predictions\n",
    "df_train = booster.transform(df_train)\n",
    "\n",
    "# view the DataFrame\n",
    "df_train.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:17:33.378417Z",
     "start_time": "2019-09-03T21:17:33.337404Z"
    }
   },
   "outputs": [],
   "source": [
    "# Check the performance\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
    "\n",
    "# Evaluate the target column once and reuse it for both metrics\n",
    "y_true = df_train_mini.trip_duration_min.values\n",
    "\n",
    "mae_train_score = mean_absolute_error(y_true, pred)\n",
    "mse_train_score = mean_squared_error(y_true, pred)\n",
    "\n",
    "print('The mean absolute error is %2.3f' % mae_train_score)\n",
    "# The second metric is an error (lower is better), not a score\n",
    "print('The mean squared error is %2.3f' % mse_train_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### So what about a pipeline?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The `vaex` `state` – all the pipeline you need!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-03T21:18:15.116377Z",
     "start_time": "2019-09-03T21:18:15.084900Z"
    }
   },
   "outputs": [],
   "source": [
    "# Save the state to disk\n",
    "# state_write only writes the file and returns nothing, so get the state\n",
    "# explicitly to keep `state` usable instead of leaving it as None\n",
    "state = df_train.state_get()\n",
    "df_train.state_write('./taxi_ml_state.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
